Pat Dayton

AJ Rahendran

Initial Setup

Set up our necessary packages. Uncomment the install.packages line the first time you run through this. Set your home directory to be the src root of this project. You’ll need to change this before you get started.

Load Data

Load in all of our data. The modulo of the sum of our UTD IDs was 2, so we will be using Tronix, Omisego, and YoCoin for our analysis.

# First our price files.
# Each file's first line is skipped and the seven columns are named here.
omg_price_df <- read.table(
  "./tokenPrices/omisego.txt",
  header = FALSE,
  skip = 1,
  col.names = c("Date", "Open", "High", "Low", "Close", "Volume", "MarketCap")
)

trn_price_df <- read.table(
  "./tokenPrices/tron",
  header = FALSE,
  skip = 1,
  col.names = c("Date", "Open", "High", "Low", "Close", "Volume", "MarketCap")
)

yoc_price_df <- read.table(
  "./tokenPrices/yocoin",
  header = FALSE,
  skip = 1,
  col.names = c("Date", "Open", "High", "Low", "Close", "Volume", "MarketCap")
)

# Next our edge files: space-delimited with no header row.
# `FALSE` is spelled out because the shorthand `F` is an ordinary variable
# that can be reassigned elsewhere.
omg_edge_df <- read_delim("./edgeFiles/omisego.txt", delim = " ", col_names = FALSE)
trn_edge_df <- read_delim("./edgeFiles/tron.txt", delim = " ", col_names = FALSE)
yoc_edge_df <- read_delim("./edgeFiles/yo.txt", delim = " ", col_names = FALSE)

# and label these as well: sender, receiver, timestamp, raw token amount
names(omg_edge_df) <- c("fromID", "toID", "unixTime", "tokenAmount")
names(trn_edge_df) <- c("fromID", "toID", "unixTime", "tokenAmount")
names(yoc_edge_df) <- c("fromID", "toID", "unixTime", "tokenAmount")

Prepare the Data

Remove Duplicates

Check for duplicated values in all of our files and remove them.

OMG

# anyDuplicated() gives the index of the first duplicated row (0 = none).
# Report it for both OMG tables, drop exact duplicate rows, then report again.
cat("omg_price_df duplicates: ", anyDuplicated(omg_price_df), "  \n")
## omg_price_df duplicates:  0
# Fixed: this line previously checked omg_price_df under the omg_edge_df label.
cat("omg_edge_df  duplicates: ", anyDuplicated(omg_edge_df), "  \n")
## (recorded value "0" was computed on omg_price_df by mistake; re-run to refresh)
omg_price_df <- omg_price_df %>% distinct()
omg_edge_df  <- omg_edge_df  %>% distinct()
cat("omg_edge_df  duplicates: ", anyDuplicated(omg_edge_df),  "  \n") # after duplicates removed
## omg_edge_df  duplicates:  0
cat("omg_price_df duplicates: ", anyDuplicated(omg_price_df), "  \n") # after duplicates removed
## omg_price_df duplicates:  0

TRX

# anyDuplicated() gives the index of the first duplicated row (0 = none).
# Report it for both TRX tables, drop exact duplicate rows, then report again.
cat("trn_price_df duplicates: ", anyDuplicated(trn_price_df), "  \n")
## trn_price_df duplicates:  0
cat("trn_edge_df  duplicates: ", anyDuplicated(trn_edge_df), "  \n")
## trn_edge_df  duplicates:  1536
trn_price_df <- distinct(trn_price_df)
trn_edge_df <- distinct(trn_edge_df)
# after duplicates removed
cat("trn_price_df duplicates: ", anyDuplicated(trn_price_df), "  \n")
## trn_price_df duplicates:  0
cat("trn_edge_df  duplicates: ", anyDuplicated(trn_edge_df), "  \n")
## trn_edge_df  duplicates:  0

YOC

# anyDuplicated() gives the index of the first duplicated row (0 = none).
# Report it for both YOC tables, drop exact duplicate rows, then report again.
cat("yoc_price_df duplicates: ", anyDuplicated(yoc_price_df), "  \n")
## yoc_price_df duplicates:  0
cat("yoc_edge_df  duplicates: ", anyDuplicated(yoc_edge_df), "  \n")
## yoc_edge_df  duplicates:  992
yoc_price_df <- distinct(yoc_price_df)
yoc_edge_df <- distinct(yoc_edge_df)
# after duplicates removed
cat("yoc_price_df duplicates: ", anyDuplicated(yoc_price_df), "  \n")
## yoc_price_df duplicates:  0
cat("yoc_edge_df  duplicates: ", anyDuplicated(yoc_edge_df), "  \n")
## yoc_edge_df  duplicates:  0

Reformat Price File Dates

Convert the date to the correct format in the price data frames.

# Parse the price-file dates (month/day/year) into Date objects.
# OMG is parsed with a four-digit year (%Y); TRX and YOC with a two-digit
# year (%y) -- presumably matching each file's layout; verify against the data.
omg_price_df[["Date"]] <- as.Date(omg_price_df[["Date"]], format = "%m/%d/%Y")
trn_price_df[["Date"]] <- as.Date(trn_price_df[["Date"]], format = "%m/%d/%y")
yoc_price_df[["Date"]] <- as.Date(yoc_price_df[["Date"]], format = "%m/%d/%y")

Filter Impossibly Large Transactions

Set our constants for each coin, then remove edge file rows where the token amount is too big to make sense. Note: in the run recorded below, rows were flagged for all three tokens (OMG 11, TRX 82, YOC 90).

# Per-token scale factors: the filters below multiply these by the supply to
# get the largest tokenAmount that is still accepted.
omg_decimals <- 10^18
trn_decimals <- 10^6
yoc_decimals <- 10^16

# Per-token supply figures used for the same upper bound.
omg_supply <- 140245398
trn_supply <- 66682072191
yoc_supply <- 369659255

OMG

# Drop OMG transfers whose tokenAmount exceeds omg_decimals * omg_supply.
# The filter is evaluated once and reused, so the reported counts describe
# exactly what is applied to omg_edge_df (the report previously used `<`
# while the applied filter used `<=`).
# NOTE: the outputs recorded below came from the strict `<` report and may
# differ for rows sitting exactly on the bound; re-run to refresh.
omg_edge_df_filtered <- omg_edge_df %>%
  filter(tokenAmount <= omg_decimals * omg_supply)
cat("Num Rows before Filtering: ", nrow(omg_edge_df), "\n")
## Num Rows before Filtering:  1143029
cat("Num Rows after Filtering: ", nrow(omg_edge_df_filtered), "\n")
## Num Rows after Filtering:  1143018
cat("Num Rows cut: ", (nrow(omg_edge_df) - nrow(omg_edge_df_filtered)), "\n")
## Num Rows cut:  11
omg_edge_df <- omg_edge_df_filtered

TRN

# Drop TRX transfers whose tokenAmount exceeds trn_decimals * trn_supply.
# The filter is evaluated once and reused, so the reported counts describe
# exactly what is applied to trn_edge_df (the report previously used `<`
# while the applied filter used `<=`).
# NOTE: the outputs recorded below came from the strict `<` report and may
# differ for rows sitting exactly on the bound; re-run to refresh.
trn_edge_df_filtered <- trn_edge_df %>%
  filter(tokenAmount <= trn_decimals * trn_supply)
cat("Num Rows before Filtering: ", nrow(trn_edge_df), "\n")
## Num Rows before Filtering:  1512662
cat("Num Rows after Filtering: ", nrow(trn_edge_df_filtered), "\n")
## Num Rows after Filtering:  1512580
cat("Num Rows cut: ", (nrow(trn_edge_df) - nrow(trn_edge_df_filtered)), "\n")
## Num Rows cut:  82
trn_edge_df <- trn_edge_df_filtered

YOC

# Drop YOC transfers whose tokenAmount exceeds yoc_decimals * yoc_supply.
# The filter is evaluated once and reused, so the reported counts describe
# exactly what is applied to yoc_edge_df (the report previously used `<`
# while the applied filter used `<=`). The column is referenced by name
# inside filter() rather than through `yoc_edge_df$`.
# NOTE: the outputs recorded below came from the strict `<` report and may
# differ for rows sitting exactly on the bound; re-run to refresh.
yoc_edge_df_filtered <- yoc_edge_df %>%
  filter(tokenAmount <= yoc_decimals * yoc_supply)
cat("Num Rows before Filtering: ", nrow(yoc_edge_df), "\n")
## Num Rows before Filtering:  595582
cat("Num Rows after Filtering: ", nrow(yoc_edge_df_filtered), "\n")
## Num Rows after Filtering:  595492
cat("Num Rows cut: ", (nrow(yoc_edge_df) - nrow(yoc_edge_df_filtered)), "\n")
## Num Rows cut:  90
yoc_edge_df <- yoc_edge_df_filtered

Reformat Edge File Dates

Update the edge data frame dates to be the correct format.

# Derive a calendar Date column from each edge table's unixTime column.
# NOTE(review): anydate() is presumably anytime::anydate -- confirm which
# time zone it applies when truncating to a day.
omg_edge_df[["Date"]] <- anydate(omg_edge_df[["unixTime"]])
trn_edge_df[["Date"]] <- anydate(trn_edge_df[["unixTime"]])
yoc_edge_df[["Date"]] <- anydate(yoc_edge_df[["unixTime"]])

Plot Price vs. Time

# Coerce the Date columns once more before plotting.
yoc_price_df[["Date"]] <- as.Date(yoc_price_df[["Date"]], format = "%m/%d/%Y")
trn_price_df[["Date"]] <- as.Date(trn_price_df[["Date"]], format = "%m/%d/%Y")
omg_price_df[["Date"]] <- as.Date(omg_price_df[["Date"]], format = "%m/%d/%Y")

# One scatter plot of opening price against date per token.
yocoin_plot <- ggplot(data = yoc_price_df, mapping = aes(x = Date, y = Open)) +
  geom_point(color = "darkblue")
yocoin_plot +
  labs(title = "YOC Historical Data", x = "Date", y = "Opening Price")

tron_plot <- ggplot(data = trn_price_df, mapping = aes(x = Date, y = Open)) +
  geom_point(color = "darkred")
tron_plot +
  labs(title = "TRX Historical Data", x = "Date", y = "Opening Price")

omg_plot <- ggplot(data = omg_price_df, mapping = aes(x = Date, y = Open)) +
  geom_point(color = "darkgreen")
omg_plot +
  labs(title = "OMG Historical Data", x = "Date", y = "Opening Price")


Part 1 – Buyer/Seller Pair Distribution

Feature Engineering

Create Transaction Pair Data Field

For the sake of basic analysis, we create a value in the dataframe for pairs of buyers and sellers.

# Tag every YOC transfer with a "from - to" key and the mirrored "to - from"
# key, tabulate how often each key occurs, and keep the first 100 rows of
# each frequency table (in table() order, not sorted by count).
yoc_edge_df$pairFrom <- with(yoc_edge_df, paste(fromID, "-", toID))
yoc_edge_df$pairTo <- with(yoc_edge_df, paste(toID, "-", fromID))

yocoin_pairFrom_counts <- head(
  setNames(
    as.data.frame(table(yoc_edge_df$pairFrom)),
    c("Pair", "Transactions")
  ),
  n = 100
)
yocoin_pairTo_counts <- head(
  setNames(
    as.data.frame(table(yoc_edge_df$pairTo)),
    c("Pair", "Transactions")
  ),
  n = 100
)

# Tag every TRX transfer with a "from - to" key and the mirrored "to - from"
# key, tabulate how often each key occurs, and keep the first 100 rows of
# each frequency table (in table() order, not sorted by count).
trn_edge_df$pairFrom <- with(trn_edge_df, paste(fromID, "-", toID))
trn_edge_df$pairTo <- with(trn_edge_df, paste(toID, "-", fromID))

tron_pairFrom_counts <- head(
  setNames(
    as.data.frame(table(trn_edge_df$pairFrom)),
    c("Pair", "Transactions")
  ),
  n = 100
)
tron_pairTo_counts <- head(
  setNames(
    as.data.frame(table(trn_edge_df$pairTo)),
    c("Pair", "Transactions")
  ),
  n = 100
)

# Tag every OMG transfer with a "from - to" key and the mirrored "to - from"
# key, tabulate how often each key occurs, and keep the first 100 rows of
# each frequency table (in table() order, not sorted by count).
omg_edge_df$pairFrom <- with(omg_edge_df, paste(fromID, "-", toID))
omg_edge_df$pairTo <- with(omg_edge_df, paste(toID, "-", fromID))

omg_pairFrom_counts <- head(
  setNames(
    as.data.frame(table(omg_edge_df$pairFrom)),
    c("Pair", "Transactions")
  ),
  n = 100
)
omg_pairTo_counts <- head(
  setNames(
    as.data.frame(table(omg_edge_df$pairTo)),
    c("Pair", "Transactions")
  ),
  n = 100
)

Data Visualization

Transactions vs. Buyer/Seller

In this section we plot the number of transactions for each buyer/seller pair. This just helps us understand the distribution of transactions between user pairs. There are two graphs for each coin, one for the buyer data and one for the seller data.

OMG

# Bar charts of the transfer count per OMG address pair ("from - to" keys,
# then the mirrored "to - from" keys), each bar annotated with its count.
# The y axis shows a number of transactions, so it is labelled that way
# (it previously read "Token Amount").
omg_plot_transFrom <-
  ggplot(data = omg_pairFrom_counts, aes(x = Pair, y = Transactions)) +
  geom_bar(stat = "identity", color = "green") +
  geom_text(aes(label = Transactions))

omg_plot_transFrom +
  ggtitle("OMG Transaction Data") +
  xlab("Buyer and Seller Pair") +
  ylab("Number of Transactions")

omg_plot_transTo <-
  ggplot(data = omg_pairTo_counts, aes(x = Pair, y = Transactions)) +
  geom_bar(stat = "identity", color = "green") +
  geom_text(aes(label = Transactions))

omg_plot_transTo +
  ggtitle("OMG Transaction Data") +
  xlab("Buyer and Seller Pair") +
  ylab("Number of Transactions")

TRX

# Bar charts of the transfer count per TRX address pair ("from - to" keys,
# then the mirrored "to - from" keys), each bar annotated with its count.
# The y axis shows a number of transactions, so it is labelled that way
# (it previously read "Token Amount").
tron_plot_transFrom <-
  ggplot(data = tron_pairFrom_counts, aes(x = Pair, y = Transactions)) +
  geom_bar(stat = "identity", color = "darkred") +
  geom_text(aes(label = Transactions))

tron_plot_transFrom +
  ggtitle("TRX Transaction Data") +
  xlab("Buyer and Seller Pair") +
  ylab("Number of Transactions")

tron_plot_transTo <-
  ggplot(data = tron_pairTo_counts, aes(x = Pair, y = Transactions)) +
  geom_bar(stat = "identity", color = "darkred") +
  geom_text(aes(label = Transactions))

tron_plot_transTo +
  ggtitle("TRX Transaction Data") +
  xlab("Buyer and Seller Pair") +
  ylab("Number of Transactions")

YOC

# Bar charts of the transfer count per YOC address pair ("from - to" keys,
# then the mirrored "to - from" keys), each bar annotated with its count.
yocoin_plot_transFrom <-
  ggplot(data = yocoin_pairFrom_counts, mapping = aes(x = Pair, y = Transactions)) +
  geom_col(color = "darkblue") +
  geom_text(mapping = aes(label = Transactions))

yocoin_plot_transFrom +
  labs(
    title = "YOC Transaction Data",
    x = "Buyer and Seller Pair",
    y = "Number of Transactions"
  )

yocoin_plot_transTo <-
  ggplot(data = yocoin_pairTo_counts, mapping = aes(x = Pair, y = Transactions)) +
  geom_col(color = "darkblue") +
  geom_text(mapping = aes(label = Transactions))

yocoin_plot_transTo +
  labs(
    title = "YOC Transaction Data",
    x = "Buyer and Seller Pair",
    y = "Number of Transactions"
  )

Transactions vs. Buyer/Seller (Top-K only)

In this section we take the same data as above and narrow the scope to only the top 20 buyers and sellers by number of transactions. From this we begin calculating the distributions of the transaction data.

# Number of transfers received (buys, keyed by toID) and sent (sells, keyed
# by fromID) per address, for each token. The count column is named `n`.
yocoin_buys.distribution <- count(yoc_edge_df, toID)
yocoin_sells.distribution <- count(yoc_edge_df, fromID)

tron_buys.distribution <- count(trn_edge_df, toID)
tron_sells.distribution <- count(trn_edge_df, fromID)

omg_buys.distribution <- count(omg_edge_df, toID)
omg_sells.distribution <- count(omg_edge_df, fromID)

# Twenty addresses with the most received transfers, YOC first.
print("Buys Top 20")
## [1] "Buys Top 20"
print(head(arrange(yocoin_buys.distribution, desc(n)), 20))
## # A tibble: 20 x 2
##       toID     n
##      <dbl> <int>
##  1 9911653 14601
##  2  309659  6592
##  3 9912976  4423
##  4 9916042  4044
##  5 9915788  3990
##  6 9913800  3351
##  7 9911955  2518
##  8 9912979  2491
##  9 9911654  2153
## 10 9911658  2104
## 11 9916222  2076
## 12 9915420  2043
## 13 9912036  1910
## 14 9914909  1818
## 15 9915919  1736
## 16 9916338  1690
## 17 9915232  1668
## 18 9915389  1652
## 19 9913169  1651
## 20 9917297  1494
print(head(arrange(tron_buys.distribution, desc(n)), 20))
## # A tibble: 20 x 2
##       toID     n
##      <dbl> <int>
##  1       5 86073
##  2 1752093 11234
##  3 1742290  8168
##  4  182337  7391
##  5 9353350  6093
##  6   75994  5967
##  7   40112  4660
##  8      26  4016
##  9      49  3292
## 10   40002  2804
## 11   75995  2520
## 12    1820  2518
## 13 1742287  2448
## 14  104502  1976
## 15  297031  1762
## 16 9245671  1747
## 17   40044  1718
## 18  104531  1690
## 19      60  1649
## 20     118  1539
print(head(arrange(omg_buys.distribution, desc(n)), 20))
## # A tibble: 20 x 2
##       toID     n
##      <dbl> <int>
##  1  297278 64501
##  2       5 53758
##  3  311608 32541
##  4   36161 22877
##  5   75994 11124
##  6  297094  7893
##  7  296381  5523
##  8 1742290  5180
##  9  182337  4508
## 10 1739369  3729
## 11  297031  3619
## 12  142341  3394
## 13      49  2692
## 14 1741637  2295
## 15  303329  2111
## 16   75989  2016
## 17  104531  1943
## 18  298450  1896
## 19  297301  1866
## 20   48315  1496
# Twenty addresses with the most sent transfers, YOC first.
print("Sells Top 20")
## [1] "Sells Top 20"
print(head(arrange(yocoin_sells.distribution, desc(n)), 20))
## # A tibble: 20 x 2
##     fromID      n
##      <dbl>  <int>
##  1 9911594 384348
##  2 9912282  36887
##  3 9916066  28026
##  4 9921392  23708
##  5 9915044  23346
##  6 9916176  21047
##  7 9915539   9228
##  8 9915042   9127
##  9 9916067   8404
## 10  309659   4093
## 11 9913938   2647
## 12 9913936   2418
## 13 9911653   1692
## 14 9913503   1349
## 15 9926337    822
## 16 9921021    648
## 17 9915105    602
## 18 9913429    591
## 19 9915975    570
## 20 9914909    567
print(head(arrange(tron_sells.distribution, desc(n)), 20))
## # A tibble: 20 x 2
##     fromID      n
##      <dbl>  <int>
##  1       5 156005
##  2 1742290  86914
##  3      82  56066
##  4      44  49618
##  5      49  45023
##  6       6  42470
##  7      17  23051
##  8      13  20717
##  9 9472639  20235
## 10 9472643  20228
## 11 9472641  20226
## 12 9472657  20220
## 13 9472667  20220
## 14 9472659  20190
## 15 9472637  20178
## 16 9472649  20168
## 17 9472665  20120
## 18 9472663  20040
## 19 9472633  19975
## 20 9472647  19948
print(head(arrange(omg_sells.distribution, desc(n)), 20))
## # A tibble: 20 x 2
##    fromID      n
##     <dbl>  <int>
##  1     17 203746
##  2 297278  52256
##  3      5  48461
##  4 311608  37141
##  5  36161  32198
##  6     13  21813
##  7  75994  19347
##  8 307831  18201
##  9 296792  18097
## 10 304118  17288
## 11 307797  16222
## 12     44  13037
## 13 296381  12880
## 14     82  12605
## 15      6  10805
## 16 297094  10665
## 17 256505  10631
## 18 297031   8572
## 19  75989   8068
## 20     49   6263

Visualize the Data

This section takes the seller and buyer distribution data, orders it by decreasing count, and keeps the top 20. Once these calculations are completed, we plot the data frames as bar charts and print the numeric value at the top of each bar. This view is far more useful for showing the transaction data.

# Top-20 YOC sellers: rank on the x axis, transfer count on the y axis, with
# the count printed above each bar.
yocoin_sell_df <- head(arrange(yocoin_sells.distribution, desc(n)), 20)
yocoin_sell_df$row_id <- as.numeric(seq_len(nrow(yocoin_sell_df)))
yocoin_sells_quant_bar <- ggplot(data = yocoin_sell_df, mapping = aes(x = row_id, y = n)) +
  geom_col(fill = "steelblue") +
  geom_text(mapping = aes(label = n), vjust = -0.3, size = 3.5) +
  theme_minimal()
print(yocoin_sells_quant_bar)

# Same chart for the top-20 YOC buyers.
yocoin_buy_df <- head(arrange(yocoin_buys.distribution, desc(n)), 20)
yocoin_buy_df$row_id <- as.numeric(seq_len(nrow(yocoin_buy_df)))
yocoin_buys_quant_bar <- ggplot(data = yocoin_buy_df, mapping = aes(x = row_id, y = n)) +
  geom_col(fill = "steelblue") +
  geom_text(mapping = aes(label = n), vjust = -0.3, size = 3.5) +
  theme_minimal()
print(yocoin_buys_quant_bar)

# Top-20 TRX sellers: rank on the x axis, transfer count on the y axis, with
# the count printed above each bar.
# Fixed: both TRX tables were previously built from the yocoin_* distributions,
# so these charts repeated the YOC data.
tron_sell_df <- tron_sells.distribution %>% arrange(-n) %>% head(20)
tron_sell_df$row_id <- as.numeric(row.names(tron_sell_df))
tron_sells_quant_bar <- ggplot(data = tron_sell_df, aes(x = row_id, y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = n), vjust = -0.3, size = 3.5) +
  theme_minimal()
print(tron_sells_quant_bar)

# Same chart for the top-20 TRX buyers.
tron_buy_df <- tron_buys.distribution %>% arrange(-n) %>% head(20)
tron_buy_df$row_id <- as.numeric(row.names(tron_buy_df))
tron_buys_quant_bar <- ggplot(data = tron_buy_df, aes(x = row_id, y = n)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = n), vjust = -0.3, size = 3.5) +
  theme_minimal()
print(tron_buys_quant_bar)

# Top-20 OMG sellers: rank on the x axis, transfer count on the y axis, with
# the count printed above each bar.
omg_sell_df <- head(arrange(omg_sells.distribution, desc(n)), 20)
omg_sell_df$row_id <- as.numeric(seq_len(nrow(omg_sell_df)))
omg_sells_quant_bar <- ggplot(data = omg_sell_df, mapping = aes(x = row_id, y = n)) +
  geom_col(fill = "steelblue") +
  geom_text(mapping = aes(label = n), vjust = -0.3, size = 3.5) +
  theme_minimal()
print(omg_sells_quant_bar)

# Same chart for the top-20 OMG buyers.
omg_buy_df <- head(arrange(omg_buys.distribution, desc(n)), 20)
omg_buy_df$row_id <- as.numeric(seq_len(nrow(omg_buy_df)))
omg_buys_quant_bar <- ggplot(data = omg_buy_df, mapping = aes(x = row_id, y = n)) +
  geom_col(fill = "steelblue") +
  geom_text(mapping = aes(label = n), vjust = -0.3, size = 3.5) +
  theme_minimal()
print(omg_buys_quant_bar)

This section is dedicated to getting proper buying distributions (ordered and truncated to the top 20). We calculate the total volume of the transactions. After that we filter the data, apply some scaling, and normalize the data.

# Transfers per "from - to" YOC pair, most frequent pair first; show the top 20.
yocoin_by_pair_df <- arrange(count(yoc_edge_df, pairFrom), desc(n))
print(head(yocoin_by_pair_df, 20))
## # A tibble: 20 x 2
##    pairFrom              n
##    <chr>             <int>
##  1 9911594 - 9912976  4419
##  2 9911594 - 9916042  4035
##  3 9911594 - 9915788  3977
##  4 9911594 - 9913800  3306
##  5 9913938 - 9911653  2647
##  6 9913936 - 9911653  2418
##  7 9911594 - 9912979  2363
##  8 9911594 - 9911955  2315
##  9 9911594 - 9916222  1819
## 10 9911594 - 9911654  1797
## 11 9911594 - 9911658  1690
## 12 9911594 - 9913169  1630
## 13 9911594 - 9912036  1600
## 14 9911594 - 9915389  1594
## 15 9911594 - 9915919  1594
## 16 9911594 - 9915232  1550
## 17 9911594 - 9915420  1493
## 18 9911594 - 9913658  1485
## 19 9911594 - 9913895  1413
## 20 9911594 - 9916338  1382
# Transfers per "from - to" TRX pair, most frequent pair first; show the top 20.
# Fixed: the print call previously displayed yocoin_by_pair_df again.
tron_by_pair_df <- trn_edge_df %>%
  group_by(pairFrom) %>%
  summarise(n = n()) %>%
  arrange(-n) %>%
  ungroup()
print(tron_by_pair_df %>% head(20))
## (recorded output removed: it was a repeat of the YOC table; re-run to regenerate)
# Transfers per "from - to" OMG pair, most frequent pair first; show the top 20.
# Fixed: the print call previously displayed yocoin_by_pair_df again.
omg_by_pair_df <- omg_edge_df %>%
  group_by(pairFrom) %>%
  summarise(n = n()) %>%
  arrange(-n) %>%
  ungroup()
print(omg_by_pair_df %>% head(20))
## (recorded output removed: it was a repeat of the YOC table; re-run to regenerate)
# Total number of transfers per token (sum of the per-pair counts).
yocoin_total_trade_volume <- sum(yocoin_by_pair_df[["n"]])
tron_total_trade_volume <- sum(tron_by_pair_df[["n"]])
omg_total_trade_volume <- sum(omg_by_pair_df[["n"]])

# Optionally Drop out the outlier pair(311608 - 311608), n(30024)
# Comment this line out if you want to leave it in
#yocoin_by_pair_df = yocoin_by_pair_df %>% filter(n < 30000)
#cat('FILTERING!\n')


# Filtering and scaling the data in different ways.
# For each token keep the 100 most frequent pairs and add:
#   row_id   - 1-based rank within those 100 rows
#   n_scaled - min-max scaled count; the +0.001 / +0.002 offsets keep every
#              value strictly between 0 and 1
#   n_norm   - count divided by the token's total transfer count
yocoin_pair_df <- yocoin_by_pair_df %>% head(100)
yocoin_pair_df$row_id <- as.numeric(row.names(yocoin_pair_df))
yocoin_pair_df$n_scaled <- (yocoin_pair_df$n - min(yocoin_pair_df$n) + 0.001) /
  (max(yocoin_pair_df$n) - min(yocoin_pair_df$n) + 0.002)

tron_pair_df <- tron_by_pair_df %>% head(100)
tron_pair_df$row_id <- as.numeric(row.names(tron_pair_df))
tron_pair_df$n_scaled <- (tron_pair_df$n - min(tron_pair_df$n) + 0.001) /
  (max(tron_pair_df$n) - min(tron_pair_df$n) + 0.002)

# Fixed: this table was previously built from yocoin_by_pair_df, so every
# downstream "OMG" fit and plot actually used YOC counts.
omg_pair_df <- omg_by_pair_df %>% head(100)
omg_pair_df$row_id <- as.numeric(row.names(omg_pair_df))
omg_pair_df$n_scaled <- (omg_pair_df$n - min(omg_pair_df$n) + 0.001) /
  (max(omg_pair_df$n) - min(omg_pair_df$n) + 0.002)
# Reverse it so it goes up and right
# pairdf$n_scaled <- rev(pairdf$n_scaled)
# normalize by total_trade_volume
yocoin_pair_df$n_norm <- yocoin_pair_df$n / yocoin_total_trade_volume
tron_pair_df$n_norm <- tron_pair_df$n / tron_total_trade_volume
omg_pair_df$n_norm <- omg_pair_df$n / omg_total_trade_volume

Fit Distributions

From here we take the data, and apply some different distributions. - Log Normal - Exponential - Geometric - Weibull - Gamma - Negative Binomial

# Display the three pair tables as plain data frames (the values are only
# shown, not stored).
as.data.frame(yocoin_pair_df)
as.data.frame(tron_pair_df)
as.data.frame(omg_pair_df)

# Columns carried into the distribution fits.
keeps <- c("n_scaled", "n", "row_id")

yocoin_clean_data <- as.data.frame(yocoin_pair_df)[, keeps]
tron_clean_data <- as.data.frame(tron_pair_df)[, keeps]
omg_clean_data <- as.data.frame(omg_pair_df)[, keeps]


# Fit candidate distributions to the YOC pair counts; the geometric and
# Weibull fits are left disabled.
fit.lnorm.yocoin_pair_df <- fitdistr(yocoin_clean_data$n, "lognormal")
fit.exp.yocoin_pair_df <- fitdistr(yocoin_clean_data$n, "exponential")
# fit.geom.yocoin_pair_df = fitdistr(yocoin_clean_data$n, densfun='geometric')
# fit.weibull.yocoin_pair_df = fitdistr(yocoin_clean_data$n, densfun='weibull')
fit.gamma.yocoin_pair_df <- fitdistr(yocoin_clean_data$n, "gamma")
fit.nbinomial.yocoin_pair_df <- fitdistr(yocoin_clean_data$n, "negative binomial")

# Fit candidate distributions to the TRX pair counts; only the geometric fit
# is left disabled.
fit.lnorm.tron_pair_df <- fitdistr(tron_clean_data$n, "lognormal")
fit.exp.tron_pair_df <- fitdistr(tron_clean_data$n, "exponential")
# fit.geom.tron_pair_df = fitdistr(tron_clean_data$n, densfun='geometric')
fit.weibull.tron_pair_df <- fitdistr(tron_clean_data$n, "weibull")
fit.gamma.tron_pair_df <- fitdistr(tron_clean_data$n, "gamma")
fit.nbinomial.tron_pair_df <- fitdistr(tron_clean_data$n, "negative binomial")

# Fit candidate distributions to the counts in omg_clean_data; the geometric
# and Weibull fits are left disabled.
fit.lnorm.omg_pair_df <- fitdistr(omg_clean_data$n, "lognormal")
fit.exp.omg_pair_df <- fitdistr(omg_clean_data$n, "exponential")
# fit.geom.omg_pair_df = fitdistr(omg_clean_data$n, densfun='geometric')
# fit.weibull.omg_pair_df = fitdistr(omg_clean_data$n, densfun='weibull')
fit.gamma.omg_pair_df <- fitdistr(omg_clean_data$n, "gamma")
fit.nbinomial.omg_pair_df <- fitdistr(omg_clean_data$n, "negative binomial")

#print(fit.lnorm.pairdf)
# fit.weibull.pairdf = fitdistr(clean_data$n, densfun='weibull', start=list(shape=1, scale=500))
# print(fit.weibull.pairdf$estimate[1])
#print(fit.lnorm.pairdf$estimate[1])
#print(fit.exp.pairdf$estimate[1])
#print(fit.geom.pairdf$estimate[1])

This section is just to set up and show the plot for each of the coins and their processed data.

# Density histogram of the counts in yocoin_clean_data with the fitted
# densities drawn on top: lognormal (red), exponential (green), gamma
# (purple) and negative binomial (pink). Weibull and geometric stay disabled.
yocoin_pair_bar <- ggplot(yocoin_clean_data) +
  geom_histogram(mapping = aes(x = n), stat = "density", fill = "steelblue") +
  stat_function(
    fun = "dlnorm",
    args = list(
      meanlog = fit.lnorm.yocoin_pair_df$estimate["meanlog"],
      sdlog = fit.lnorm.yocoin_pair_df$estimate["sdlog"]
    ),
    n = 100,
    size = 1,
    color = "red"
  ) +
  stat_function(
    fun = "dexp",
    args = list(rate = fit.exp.yocoin_pair_df$estimate["rate"]),
    size = 1,
    color = "green"
  ) +
  # stat_function(fun = "dweibull",
  #               size = 1,
  #               args = list(shape = fit.weibull.yocoin_pair_df$estimate[1],
  #               scale=fit.weibull.yocoin_pair_df$estimate[2]),
  #               color = "orange") +
  stat_function(
    fun = "dgamma",
    args = list(
      shape = fit.gamma.yocoin_pair_df$estimate["shape"],
      rate = fit.gamma.yocoin_pair_df$estimate["rate"]
    ),
    size = 1,
    color = "purple"
  ) +
  # The ones below here are somewhat useless for the full dataset
  # stat_function(fun = "dgeom",
  #               size = 2,
  #               args = list(prob = fit.geom.yocoin_pair_df$estimate[1]),
  #               color = "blue") +
  stat_function(
    fun = "dnbinom",
    args = list(
      size = fit.nbinomial.yocoin_pair_df$estimate["size"],
      mu = fit.nbinomial.yocoin_pair_df$estimate["mu"]
    ),
    size = 1,
    color = "pink"
  ) +
  theme_minimal()

# Density histogram of the counts in tron_clean_data with the fitted
# densities drawn on top: lognormal (red), exponential (green), Weibull
# (orange), gamma (purple) and negative binomial (pink). Geometric stays
# disabled.
tron_pair_bar <- ggplot(tron_clean_data) +
  geom_histogram(mapping = aes(x = n), stat = "density", fill = "steelblue") +
  stat_function(
    fun = "dlnorm",
    args = list(
      meanlog = fit.lnorm.tron_pair_df$estimate["meanlog"],
      sdlog = fit.lnorm.tron_pair_df$estimate["sdlog"]
    ),
    n = 100,
    size = 1,
    color = "red"
  ) +
  stat_function(
    fun = "dexp",
    args = list(rate = fit.exp.tron_pair_df$estimate["rate"]),
    size = 1,
    color = "green"
  ) +
  stat_function(
    fun = "dweibull",
    args = list(
      shape = fit.weibull.tron_pair_df$estimate["shape"],
      scale = fit.weibull.tron_pair_df$estimate["scale"]
    ),
    size = 1,
    color = "orange"
  ) +
  stat_function(
    fun = "dgamma",
    args = list(
      shape = fit.gamma.tron_pair_df$estimate["shape"],
      rate = fit.gamma.tron_pair_df$estimate["rate"]
    ),
    size = 1,
    color = "purple"
  ) +
  # The ones below here are somewhat useless for the full dataset
  # stat_function(fun = "dgeom",
  #               size = 2,
  #               args = list(prob = fit.geom.tron_pair_df$estimate[1]),
  #               color = "blue") +
  stat_function(
    fun = "dnbinom",
    args = list(
      size = fit.nbinomial.tron_pair_df$estimate["size"],
      mu = fit.nbinomial.tron_pair_df$estimate["mu"]
    ),
    size = 1,
    color = "pink"
  ) +
  theme_minimal()

# Density histogram of the counts in omg_clean_data with the fitted
# densities drawn on top: lognormal (red), exponential (green), gamma
# (purple) and negative binomial (pink). Weibull and geometric stay disabled.
omg_pair_bar <- ggplot(omg_clean_data) +
  geom_histogram(mapping = aes(x = n), stat = "density", fill = "steelblue") +
  stat_function(
    fun = "dlnorm",
    args = list(
      meanlog = fit.lnorm.omg_pair_df$estimate["meanlog"],
      sdlog = fit.lnorm.omg_pair_df$estimate["sdlog"]
    ),
    n = 100,
    size = 1,
    color = "red"
  ) +
  stat_function(
    fun = "dexp",
    args = list(rate = fit.exp.omg_pair_df$estimate["rate"]),
    size = 1,
    color = "green"
  ) +
  # stat_function(fun = "dweibull",
  #               size = 1,
  #               args = list(shape = fit.weibull.omg_pair_df$estimate[1],
  #               scale=fit.weibull.omg_pair_df$estimate[2]),
  #               color = "orange") +
  stat_function(
    fun = "dgamma",
    args = list(
      shape = fit.gamma.omg_pair_df$estimate["shape"],
      rate = fit.gamma.omg_pair_df$estimate["rate"]
    ),
    size = 1,
    color = "purple"
  ) +
  # The ones below here are somewhat useless for the full dataset
  # stat_function(fun = "dgeom",
  #               size = 2,
  #               args = list(prob = fit.geom.omg_pair_df$estimate[1]),
  #               color = "blue") +
  stat_function(
    fun = "dnbinom",
    args = list(
      size = fit.nbinomial.omg_pair_df$estimate["size"],
      mu = fit.nbinomial.omg_pair_df$estimate["mu"]
    ),
    size = 1,
    color = "pink"
  ) +
  theme_minimal()

# Can do this later...
# list(mean = fit$estimate[1], sd = fit$estimate[2])) 

# Render the three fitted-density plots in order: YOC, TRX, OMG.
invisible(lapply(list(yocoin_pair_bar, tron_pair_bar, omg_pair_bar), print))

## Conclusions

Log norm seems to work best for all coins…


Part 2 – Multiple Linear Regression

Feature Engineering

Determine some extra features on which we can build our multiple linear regressions.

Determine Buys/Sells by Top Buyers/Sellers

Calculate number of buys and sells by user_id Great description here: https://stackoverflow.com/questions/25869378/what-does-n-n-mean-in-r

# Transfers received per address (buys, keyed by toID) for each token.
omg_buys <- count(omg_edge_df, toID)
trn_buys <- count(trn_edge_df, toID)
yoc_buys <- count(yoc_edge_df, toID)

# Transfers sent per address (sells, keyed by fromID) for each token.
omg_sells <- count(omg_edge_df, fromID)
trn_sells <- count(trn_edge_df, fromID)
yoc_sells <- count(yoc_edge_df, fromID)

Filter by Top-K Buyers and Build Features

Filter to only include top K buyers and build a dataframe with the summarized data for fitting a regression model. Features we create here include: - Avg_Tok_Amt: Average Token Amount traded for the top-K users on the given day - Tot_Tok_Amt: Total Token Amount traded by the top-K users on the given day - Transactions: Number of transactions by the top-K users on the given day - Distinct Buyers: Distinct number of buyers for a given day - Distinct Sellers: Distinct number of sellers for a given day

# Number of top buyers kept per token.
K_omg <- 104
K_trn <- 18000
K_yoc <- 136

# Filter to only include top K buyers (ranked by transfers received)
omg_buys <- head(arrange(omg_buys, desc(n)), K_omg)
trn_buys <- head(arrange(trn_buys, desc(n)), K_trn)
yoc_buys <- head(arrange(yoc_buys, desc(n)), K_yoc)

# Keep only the transfers whose receiver is one of those top-K buyers.
omg_top_k_buys <- filter(omg_edge_df, toID %in% omg_buys$toID)
trn_top_k_buys <- filter(trn_edge_df, toID %in% trn_buys$toID)
yoc_top_k_buys <- filter(yoc_edge_df, toID %in% yoc_buys$toID)


# Create a dataframe with summarized data for fitting a regression model
# One row per Date for the OMG top-K buyer transfers: mean and total token
# amount, transfer count, and distinct receivers / senders on that day.
omg_fit_data <- ungroup(
  summarise(
    group_by(omg_top_k_buys, Date),
    Avg_Tok_Amt = mean(tokenAmount),
    Tot_Tok_Amt = sum(tokenAmount),
    Transactions = n(),
    Distinct_Buyers = n_distinct(toID),
    Distinct_Sellers = n_distinct(fromID)
  )
)

# One row per Date for the TRX top-K buyer transfers: mean and total token
# amount, transfer count, and distinct receivers / senders on that day.
trn_fit_data <- ungroup(
  summarise(
    group_by(trn_top_k_buys, Date),
    Avg_Tok_Amt = mean(tokenAmount),
    Tot_Tok_Amt = sum(tokenAmount),
    Transactions = n(),
    Distinct_Buyers = n_distinct(toID),
    Distinct_Sellers = n_distinct(fromID)
  )
)

# One row per Date for the YOC top-K buyer transfers: mean and total token
# amount, transfer count, and distinct receivers / senders on that day.
yoc_fit_data <- ungroup(
  summarise(
    group_by(yoc_top_k_buys, Date),
    Avg_Tok_Amt = mean(tokenAmount),
    Tot_Tok_Amt = sum(tokenAmount),
    Transactions = n(),
    Distinct_Buyers = n_distinct(toID),
    Distinct_Sellers = n_distinct(fromID)
  )
)

Join Tables

Join edge data to pricing data based on the Date. We lose a small percentage of the data here due to the fact that the timeframes for the two data files do not match perfectly.

# Attach the price columns to the daily features, matching rows on Date.
# merge() keeps only dates present in both tables.
omg_fit_data <- merge(x = omg_fit_data, y = omg_price_df, by = "Date")
trn_fit_data <- merge(x = trn_fit_data, y = trn_price_df, by = "Date")
yoc_fit_data <- merge(x = yoc_fit_data, y = yoc_price_df, by = "Date")

Add Historic Data to Data Frame

Calculate the close values of the previous 3 days. Note: m1 refers to minus 1, i.e. one day previous

# Add Close shifted by 1, 2 and 3 positions as Close_m1 / Close_m2 / Close_m3.
# NOTE(review): shift() is presumably data.table::shift, which lags by rows;
# if a token has missing dates the "previous day" is really the previous
# available row -- confirm this is acceptable.
omg_fit_data[["Close_m1"]] <- shift(omg_fit_data[["Close"]], n = 1)
omg_fit_data[["Close_m2"]] <- shift(omg_fit_data[["Close"]], n = 2)
omg_fit_data[["Close_m3"]] <- shift(omg_fit_data[["Close"]], n = 3)

trn_fit_data[["Close_m1"]] <- shift(trn_fit_data[["Close"]], n = 1)
trn_fit_data[["Close_m2"]] <- shift(trn_fit_data[["Close"]], n = 2)
trn_fit_data[["Close_m3"]] <- shift(trn_fit_data[["Close"]], n = 3)

yoc_fit_data[["Close_m1"]] <- shift(yoc_fit_data[["Close"]], n = 1)
yoc_fit_data[["Close_m2"]] <- shift(yoc_fit_data[["Close"]], n = 2)
yoc_fit_data[["Close_m3"]] <- shift(yoc_fit_data[["Close"]], n = 3)

Inspecting the Data

Let’s take a look at our data with our newly engineered features on which we will fit our multiple regression model.

# Display the three regression input tables (bare names rely on top-level
# auto-printing, so nothing is shown if this file is run via source()).
omg_fit_data
trn_fit_data
yoc_fit_data

Let’s also take a look at how many days are tracked for the three tokens in our data sets. We have the most data for YOC.

# Number of rows in each regression input table after the join.
cat("OMG Rows: ", nrow(omg_fit_data), "\n", sep = " ")
## OMG Rows:  297
cat("TRX Rows: ", nrow(trn_fit_data), "\n", sep = " ")
## TRX Rows:  236
cat("YOC Rows: ", nrow(yoc_fit_data), "\n", sep = " ")
## YOC Rows:  422

Correlation of Regressors

We chose to regress to the Close value of the token, so we will compare the correlation of each of the regressors (Xs) to the Close (Y).

We can make the observation from this data that the previous day’s prices are far more correlated to the Close price on the day when compared to the token amounts, distinct buyers, and other features we engineered. This is expected.

OMG

# Pearson correlation of each OMG regressor with Close. The lagged closes have
# NA in their leading rows, so those use complete observations only.
cat("Transactions:         ", with(omg_fit_data, cor(Close, Transactions)), "\n")
## Transactions:          0.3133622
cat("Total Token Amount:  ", with(omg_fit_data, cor(Close, Tot_Tok_Amt)), "\n")
## Total Token Amount:   -0.2566245
cat("Average Token Amount:", with(omg_fit_data, cor(Close, Avg_Tok_Amt)), "\n")
## Average Token Amount: -0.4220751
cat("Distinct Buyers:      ", with(omg_fit_data, cor(Close, Distinct_Buyers)), "\n")
## Distinct Buyers:       0.6134334
cat("Distinct Sellers:     ", with(omg_fit_data, cor(Close, Distinct_Sellers)), "\n")
## Distinct Sellers:      0.2075846
cat("Close Minus 1:        ", with(omg_fit_data, cor(Close, Close_m1, use = "complete.obs")), "\n")
## Close Minus 1:         0.9786079
cat("Close Minus 2:        ", with(omg_fit_data, cor(Close, Close_m2, use = "complete.obs")), "\n")
## Close Minus 2:         0.9591295
cat("Close Minus 3:        ", with(omg_fit_data, cor(Close, Close_m3, use = "complete.obs")), "\n")
## Close Minus 3:         0.9387131

TRX

# Pearson correlation of each TRX regressor with Close. The lagged closes have
# NA in their leading rows, so those use complete observations only.
cat("Transactions:         ", with(trn_fit_data, cor(Close, Transactions)), "\n")
## Transactions:          0.5137099
cat("Total Token Amount:   ", with(trn_fit_data, cor(Close, Tot_Tok_Amt)), "\n")
## Total Token Amount:    0.194449
cat("Average Token Amount:", with(trn_fit_data, cor(Close, Avg_Tok_Amt)), "\n")
## Average Token Amount: -0.1125508
cat("Distinct Buyers:      ", with(trn_fit_data, cor(Close, Distinct_Buyers)), "\n")
## Distinct Buyers:       0.8720125
cat("Distinct Sellers:     ", with(trn_fit_data, cor(Close, Distinct_Sellers)), "\n")
## Distinct Sellers:      0.243085
cat("Close Minus 1:        ", with(trn_fit_data, cor(Close, Close_m1, use = "complete.obs")), "\n")
## Close Minus 1:         0.9615547
cat("Close Minus 2:        ", with(trn_fit_data, cor(Close, Close_m2, use = "complete.obs")), "\n")
## Close Minus 2:         0.9165609
cat("Close Minus 3:        ", with(trn_fit_data, cor(Close, Close_m3, use = "complete.obs")), "\n")
## Close Minus 3:         0.86749

YOC

# Pearson correlation of each YOC regressor with Close. The lagged closes have
# NA in their leading rows, so those use complete observations only.
cat("Transactions:        ", with(yoc_fit_data, cor(Close, Transactions)), "\n")
## Transactions:         -0.03078097
cat("Total Token Amount:  ", with(yoc_fit_data, cor(Close, Tot_Tok_Amt)), "\n")
## Total Token Amount:   -0.2928949
cat("Average Token Amount:", with(yoc_fit_data, cor(Close, Avg_Tok_Amt)), "\n")
## Average Token Amount: -0.2878166
cat("Distinct Buyers:      ", with(yoc_fit_data, cor(Close, Distinct_Buyers)), "\n")
## Distinct Buyers:       0.3815441
cat("Distinct Sellers:     ", with(yoc_fit_data, cor(Close, Distinct_Sellers)), "\n")
## Distinct Sellers:      0.05851667
cat("Close Minus 1:        ", with(yoc_fit_data, cor(Close, Close_m1, use = "complete.obs")), "\n")
## Close Minus 1:         0.9806905
cat("Close Minus 2:        ", with(yoc_fit_data, cor(Close, Close_m2, use = "complete.obs")), "\n")
## Close Minus 2:         0.9751215
cat("Close Minus 3:        ", with(yoc_fit_data, cor(Close, Close_m3, use = "complete.obs")), "\n")
## Close Minus 3:         0.9702466

Create the Multiple Linear Regression Model

Time to actually perform the fit via multiple linear regression. We will fit two different models for each coin: the first includes the previous 3 days of Close prices, and the second uses only the features we engineered. Because the previous three days’ closes are so highly correlated with the price, including them makes the R^2 value significantly higher, but we lose some understanding of which of the engineered features actually contributes the most.

Note: The models ending in “_hist” take the price history for the three previous days into account. Those with “_no_hist” endings do not.

# OMG, with price history: engineered daily features plus the three lagged
# closes. Rows whose lags are NA are excluded from this fit.
omg_fit_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers +
    Close_m1 + Close_m2 + Close_m3,
  data = omg_fit_data
)

# OMG, without price history: engineered daily features only.
omg_fit_no_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers,
  data = omg_fit_data
)

# TRX, with price history: engineered daily features plus the three lagged
# closes. Rows whose lags are NA are excluded from this fit.
trn_fit_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers +
    Close_m1 + Close_m2 + Close_m3,
  data = trn_fit_data
)

# TRX, without price history: engineered daily features only.
trn_fit_no_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers,
  data = trn_fit_data
)

# YOC, with price history: engineered daily features plus the three lagged
# closes. Rows whose lags are NA are excluded from this fit.
yoc_fit_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers +
    Close_m1 + Close_m2 + Close_m3,
  data = yoc_fit_data
)

# YOC, without price history: engineered daily features only.
yoc_fit_no_hist <- lm(
  formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions +
    Distinct_Buyers + Distinct_Sellers,
  data = yoc_fit_data
)

OMG Summary Data and Plots for Model with Close History

# Coefficients, residual quantiles and R^2 for the OMG fit with lagged closes.
omg_hist_summary <- summary(omg_fit_hist)
print(omg_hist_summary)
## 
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions + 
##     Distinct_Buyers + Distinct_Sellers + Close_m1 + Close_m2 + 
##     Close_m3, data = omg_fit_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8643 -0.4799 -0.0258  0.5055  4.9258 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -7.347e-02  2.614e-01  -0.281   0.7788    
## Avg_Tok_Amt      -6.106e-23  5.994e-23  -1.019   0.3092    
## Tot_Tok_Amt       1.926e-25  8.781e-26   2.193   0.0291 *  
## Transactions      1.266e-04  8.702e-05   1.455   0.1467    
## Distinct_Buyers   1.225e-02  7.139e-03   1.716   0.0872 .  
## Distinct_Sellers  7.825e-05  1.405e-04   0.557   0.5780    
## Close_m1          9.173e-01  5.791e-02  15.842   <2e-16 ***
## Close_m2          3.376e-02  7.878e-02   0.429   0.6686    
## Close_m3         -2.060e-03  5.695e-02  -0.036   0.9712    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.056 on 285 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:   0.96,  Adjusted R-squared:  0.9589 
## F-statistic:   855 on 8 and 285 DF,  p-value: < 2.2e-16
# Draw the lm diagnostic plots for the OMG fit with lagged closes.
plot(omg_fit_hist)

OMG Summary Data and Plots for Model without Close History

# Coefficients, residual quantiles and R^2 for the OMG fit on engineered features only.
omg_no_hist_summary <- summary(omg_fit_no_hist)
print(omg_no_hist_summary)
## 
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions + 
##     Distinct_Buyers + Distinct_Sellers, data = omg_fit_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.2808 -3.2030 -0.2738  2.4096 11.4253 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3.341e+00  8.543e-01   3.911 0.000115 ***
## Avg_Tok_Amt      -2.385e-22  1.331e-22  -1.791 0.074285 .  
## Tot_Tok_Amt      -1.296e-25  2.897e-25  -0.447 0.654941    
## Transactions      1.448e-03  3.159e-04   4.585 6.75e-06 ***
## Distinct_Buyers   2.335e-01  2.283e-02  10.227  < 2e-16 ***
## Distinct_Sellers -1.334e-03  5.184e-04  -2.573 0.010573 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.99 on 291 degrees of freedom
## Multiple R-squared:  0.4418, Adjusted R-squared:  0.4322 
## F-statistic: 46.06 on 5 and 291 DF,  p-value: < 2.2e-16
# Draw the lm diagnostic plots for the OMG fit on engineered features only.
plot(omg_fit_no_hist)

TRX Summary Data and Plots for Model with Close History

# Coefficients, residual quantiles and R^2 for the TRX fit with lagged closes.
trn_hist_summary <- summary(trn_fit_hist)
print(trn_hist_summary)
## 
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions + 
##     Distinct_Buyers + Distinct_Sellers + Close_m1 + Close_m2 + 
##     Close_m3, data = trn_fit_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.030480 -0.002582  0.000472  0.001864  0.066202 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.382e-03  8.301e-04  -1.665  0.09741 .  
## Avg_Tok_Amt       1.331e-18  3.627e-17   0.037  0.97075    
## Tot_Tok_Amt       1.182e-19  1.597e-19   0.740  0.45994    
## Transactions     -3.004e-06  1.580e-06  -1.901  0.05861 .  
## Distinct_Buyers   3.328e-05  5.320e-06   6.256 1.98e-09 ***
## Distinct_Sellers  2.639e-06  1.730e-06   1.526  0.12846    
## Close_m1          8.077e-01  5.785e-02  13.962  < 2e-16 ***
## Close_m2          4.101e-02  7.806e-02   0.525  0.59987    
## Close_m3         -1.467e-01  5.462e-02  -2.685  0.00779 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.008241 on 224 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.9525, Adjusted R-squared:  0.9508 
## F-statistic: 561.2 on 8 and 224 DF,  p-value: < 2.2e-16
# Draw the lm diagnostic plots for the TRX fit with lagged closes.
plot(trn_fit_hist)

TRX Summary Data and Plots for Model without Close History

# Coefficients, residual quantiles and R^2 for the TRX fit on engineered features only.
trn_no_hist_summary <- summary(trn_fit_no_hist)
print(trn_no_hist_summary)
## 
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions + 
##     Distinct_Buyers + Distinct_Sellers, data = trn_fit_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.080304 -0.007163 -0.001766  0.003777  0.068697 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.037e-03  1.617e-03  -0.641    0.522    
## Avg_Tok_Amt       1.178e-17  7.153e-17   0.165    0.869    
## Tot_Tok_Amt      -4.220e-19  3.077e-19  -1.372    0.172    
## Transactions     -1.989e-05  2.809e-06  -7.081 1.72e-11 ***
## Distinct_Buyers   1.264e-04  7.336e-06  17.229  < 2e-16 ***
## Distinct_Sellers  2.007e-05  3.113e-06   6.449 6.57e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01628 on 230 degrees of freedom
## Multiple R-squared:  0.8115, Adjusted R-squared:  0.8074 
## F-statistic: 198.1 on 5 and 230 DF,  p-value: < 2.2e-16
# Draw the lm diagnostic plots for the TRX fit on engineered features only.
plot(trn_fit_no_hist)

YOC Summary Data and Plots for Model with Close History

# Coefficients, residual quantiles and R^2 for the YOC fit with lagged closes.
yoc_hist_summary <- summary(yoc_fit_hist)
print(yoc_hist_summary)
## 
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions + 
##     Distinct_Buyers + Distinct_Sellers + Close_m1 + Close_m2 + 
##     Close_m3, data = yoc_fit_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.052084 -0.001512 -0.000703  0.001339  0.044489 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.171e-03  7.241e-04   1.617 0.106554    
## Avg_Tok_Amt      -2.921e-25  5.028e-25  -0.581 0.561621    
## Tot_Tok_Amt      -4.565e-30  4.740e-27  -0.001 0.999232    
## Transactions     -9.850e-07  9.762e-07  -1.009 0.313549    
## Distinct_Buyers  -1.245e-05  2.951e-05  -0.422 0.673353    
## Distinct_Sellers  2.688e-06  5.304e-06   0.507 0.612531    
## Close_m1          5.855e-01  5.028e-02  11.644  < 2e-16 ***
## Close_m2          2.244e-01  5.732e-02   3.915 0.000106 ***
## Close_m3          1.595e-01  5.007e-02   3.185 0.001558 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007628 on 410 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.9661, Adjusted R-squared:  0.9654 
## F-statistic:  1461 on 8 and 410 DF,  p-value: < 2.2e-16
# Draw the lm diagnostic plots for the YOC fit with lagged closes.
plot(yoc_fit_hist)

YOC Summary Data and Plots for Model without Close History

# Coefficients, residual quantiles and R^2 for the YOC fit on engineered features only.
yoc_no_hist_summary <- summary(yoc_fit_no_hist)
print(yoc_no_hist_summary)
## 
## Call:
## lm(formula = Close ~ Avg_Tok_Amt + Tot_Tok_Amt + Transactions + 
##     Distinct_Buyers + Distinct_Sellers, data = yoc_fit_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.064999 -0.026200 -0.005130  0.007418  0.138106 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3.553e-02  3.040e-03  11.688  < 2e-16 ***
## Avg_Tok_Amt      -5.981e-24  2.411e-24  -2.481   0.0135 *  
## Tot_Tok_Amt      -4.511e-26  2.278e-26  -1.981   0.0483 *  
## Transactions     -2.628e-05  4.527e-06  -5.806 1.27e-08 ***
## Distinct_Buyers   1.118e-03  1.278e-04   8.749  < 2e-16 ***
## Distinct_Sellers  4.069e-05  2.521e-05   1.614   0.1072    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03683 on 416 degrees of freedom
## Multiple R-squared:  0.2544, Adjusted R-squared:  0.2454 
## F-statistic: 28.39 on 5 and 416 DF,  p-value: < 2.2e-16
# Draw the lm diagnostic plots for the YOC fit on engineered features only.
plot(yoc_fit_no_hist)

Conclusion

We find that including the last three days of close prices really overpowers any gains we make via our engineered regressors. All three models give us R^2 values over .95, which is great! Unfortunately, this will not be able to predict quick spikes or drops in the price, as it is simply going to estimate a linear trajectory based on the previous days’ action.

If we disregard the previous days’ close prices, we are able to get the following R^2 values after [manually] experimenting with K values representing the top K buyers. - OMG: 0.4418 (K=104) - TRN: ~0.8115 (K=~18,000) - YOC: 0.2551 (K=135)

Note that TRN’s K value which produced the highest R^2 Value was exceptionally high compared to OMG and YOC. We plan to explore why this was the case in our writeup.